/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifdef ENABLE_ARM64
#include "nnacl/assembly_global.h"

.text
.align 5

// void PreSum4x16Int8Pert(const int8_t *src, int32_t *dst, size_t row4, size_t col16, int32_t filter_zp);
//
// Walks src in 64-byte tiles. For each tile, the signed bytes of every
// 16-byte quarter are summed into a separate 32-bit lane (quarter k -> lane k).
// After col16 / 16 tiles the four lane totals are multiplied by filter_zp and
// stored to dst as four int32 values. This repeats row4 / 4 times.
// NOTE(review): presumably each 16-byte quarter holds 16 columns of one row of
// a 4x16 packed matrix, so dst receives one value per row - confirm with caller.
//
// Arguments (AAPCS64):
//   x0  src        read pointer, post-incremented by 64 bytes per tile
//   x1  dst        write pointer, post-incremented by 16 bytes per row group
//   w2  row4       row count, consumed in steps of 4 (low 32 bits only)
//   w3  col16      column count, consumed in steps of 16 (low 32 bits only)
//   w4  filter_zp  multiplier applied to every sum
//
// Clobbers: w5, w6, v0-v7, v16, v17, condition flags. No stack usage.

asm_function PreSum4x16Int8Pert
  dup v17.4s, w4                // v17 = filter_zp in all four lanes
  mov w5, #0                    // w5 = rows consumed so far

RowLoop:
  cmp w5, w2
  bhs End                       // unsigned >= so a row4 that is not a multiple of 4 still terminates
  add w5, w5, #4
  dup v16.4s, wzr               // v16 = four running sums for this row group
  mov w6, #0                    // w6 = columns consumed so far

CalLoop:
  cmp w6, w3
  bhs Write                     // unsigned >= so a col16 that is not a multiple of 16 still terminates
  add w6, w6, #16

  // Load one 64-byte tile as four 16-byte vectors.
  ld1 {v0.16b}, [x0], #16
  ld1 {v1.16b}, [x0], #16
  ld1 {v2.16b}, [x0], #16
  ld1 {v3.16b}, [x0], #16

  // Signed pairwise widening add: 16 x int8 -> 8 x int16.
  saddlp v4.8h, v0.16b
  saddlp v5.8h, v1.16b
  saddlp v6.8h, v2.16b
  saddlp v7.8h, v3.16b

  // Signed pairwise widening add: 8 x int16 -> 4 x int32.
  saddlp v0.4S, v4.8h
  saddlp v1.4S, v5.8h
  saddlp v2.4S, v6.8h
  saddlp v3.4S, v7.8h

  // Reduce each vector to a single int32 total.
  addv s4, v0.4S
  addv s5, v1.4S
  addv s6, v2.4S
  addv s7, v3.4S

  // Gather the four totals into the lanes of v0.
  mov v0.s[0], v4.s[0]
  mov v0.s[1], v5.s[0]
  mov v0.s[2], v6.s[0]
  mov v0.s[3], v7.s[0]

  add v16.4s, v16.4s, v0.4s     // accumulate this tile into the running sums
  b CalLoop

Write:
  mul v16.4s, v16.4s, v17.4s    // scale every sum by filter_zp
  st1 {v16.4s}, [x1], #16
  b RowLoop                     // unconditional: do not depend on flags left over from CalLoop

End:
  ret
#endif